{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 特征工程"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 异常值处理"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "from  matplotlib import pyplot as plt\n",
    "import seaborn as sns\n",
    "import scipy.stats as ss\n",
    "%matplotlib inline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>A</th>\n",
       "      <th>B</th>\n",
       "      <th>C</th>\n",
       "      <th>D</th>\n",
       "      <th>E</th>\n",
       "      <th>F</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>a0</td>\n",
       "      <td>b0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.1</td>\n",
       "      <td>10.0</td>\n",
       "      <td>f0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>a1</td>\n",
       "      <td>b1</td>\n",
       "      <td>2.0</td>\n",
       "      <td>10.2</td>\n",
       "      <td>19.0</td>\n",
       "      <td>f2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>a1</td>\n",
       "      <td>b2</td>\n",
       "      <td>NaN</td>\n",
       "      <td>11.4</td>\n",
       "      <td>32.0</td>\n",
       "      <td>g2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>a2</td>\n",
       "      <td>b2</td>\n",
       "      <td>3.0</td>\n",
       "      <td>8.9</td>\n",
       "      <td>25.0</td>\n",
       "      <td>f3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>a3</td>\n",
       "      <td>b3</td>\n",
       "      <td>4.0</td>\n",
       "      <td>9.1</td>\n",
       "      <td>8.0</td>\n",
       "      <td>f4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>5</td>\n",
       "      <td>a4</td>\n",
       "      <td>None</td>\n",
       "      <td>5.0</td>\n",
       "      <td>12.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>f5</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    A     B    C     D     E   F\n",
       "0  a0    b0  1.0   0.1  10.0  f0\n",
       "1  a1    b1  2.0  10.2  19.0  f2\n",
       "2  a1    b2  NaN  11.4  32.0  g2\n",
       "3  a2    b2  3.0   8.9  25.0  f3\n",
       "4  a3    b3  4.0   9.1   8.0  f4\n",
       "5  a4  None  5.0  12.0   NaN  f5"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df1 = pd.DataFrame({'A':['a0','a1','a1','a2','a3','a4'],\n",
    "                    'B':['b0','b1','b2','b2','b3',None],\n",
    "                    'C':[1,2,None,3,4,5],\n",
    "                    'D':[0.1,10.2,11.4,8.9,9.1,12],\n",
    "                    'E':[10,19,32,25,8,None],\n",
    "                    'F':['f0','f2','g2','f3','f4','f5']})\n",
    "df1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>A</th>\n",
       "      <th>B</th>\n",
       "      <th>C</th>\n",
       "      <th>D</th>\n",
       "      <th>E</th>\n",
       "      <th>F</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>5</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       A      B      C      D      E      F\n",
       "0  False  False  False  False  False  False\n",
       "1  False  False  False  False  False  False\n",
       "2  False  False   True  False  False  False\n",
       "3  False  False  False  False  False  False\n",
       "4  False  False  False  False  False  False\n",
       "5  False   True  False  False   True  False"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 识别空值\n",
    "df1.isnull()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>A</th>\n",
       "      <th>B</th>\n",
       "      <th>C</th>\n",
       "      <th>D</th>\n",
       "      <th>E</th>\n",
       "      <th>F</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>a0</td>\n",
       "      <td>b0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.1</td>\n",
       "      <td>10.0</td>\n",
       "      <td>f0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>a1</td>\n",
       "      <td>b1</td>\n",
       "      <td>2.0</td>\n",
       "      <td>10.2</td>\n",
       "      <td>19.0</td>\n",
       "      <td>f2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>a2</td>\n",
       "      <td>b2</td>\n",
       "      <td>3.0</td>\n",
       "      <td>8.9</td>\n",
       "      <td>25.0</td>\n",
       "      <td>f3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>a3</td>\n",
       "      <td>b3</td>\n",
       "      <td>4.0</td>\n",
       "      <td>9.1</td>\n",
       "      <td>8.0</td>\n",
       "      <td>f4</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    A   B    C     D     E   F\n",
       "0  a0  b0  1.0   0.1  10.0  f0\n",
       "1  a1  b1  2.0  10.2  19.0  f2\n",
       "3  a2  b2  3.0   8.9  25.0  f3\n",
       "4  a3  b3  4.0   9.1   8.0  f4"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 删除空值\n",
    "df1.dropna()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>A</th>\n",
       "      <th>B</th>\n",
       "      <th>C</th>\n",
       "      <th>D</th>\n",
       "      <th>E</th>\n",
       "      <th>F</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>a0</td>\n",
       "      <td>b0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.1</td>\n",
       "      <td>10.0</td>\n",
       "      <td>f0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>a1</td>\n",
       "      <td>b1</td>\n",
       "      <td>2.0</td>\n",
       "      <td>10.2</td>\n",
       "      <td>19.0</td>\n",
       "      <td>f2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>a1</td>\n",
       "      <td>b2</td>\n",
       "      <td>NaN</td>\n",
       "      <td>11.4</td>\n",
       "      <td>32.0</td>\n",
       "      <td>g2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>a2</td>\n",
       "      <td>b2</td>\n",
       "      <td>3.0</td>\n",
       "      <td>8.9</td>\n",
       "      <td>25.0</td>\n",
       "      <td>f3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>a3</td>\n",
       "      <td>b3</td>\n",
       "      <td>4.0</td>\n",
       "      <td>9.1</td>\n",
       "      <td>8.0</td>\n",
       "      <td>f4</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    A   B    C     D     E   F\n",
       "0  a0  b0  1.0   0.1  10.0  f0\n",
       "1  a1  b1  2.0  10.2  19.0  f2\n",
       "2  a1  b2  NaN  11.4  32.0  g2\n",
       "3  a2  b2  3.0   8.9  25.0  f3\n",
       "4  a3  b3  4.0   9.1   8.0  f4"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df1.dropna(subset=['B'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    False\n",
       "1    False\n",
       "2     True\n",
       "3    False\n",
       "4    False\n",
       "5    False\n",
       "dtype: bool"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 识别重复值\n",
    "df1.duplicated(['A'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>A</th>\n",
       "      <th>B</th>\n",
       "      <th>C</th>\n",
       "      <th>D</th>\n",
       "      <th>E</th>\n",
       "      <th>F</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>a0</td>\n",
       "      <td>b0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.1</td>\n",
       "      <td>10.0</td>\n",
       "      <td>f0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>a1</td>\n",
       "      <td>b1</td>\n",
       "      <td>2.0</td>\n",
       "      <td>10.2</td>\n",
       "      <td>19.0</td>\n",
       "      <td>f2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>a2</td>\n",
       "      <td>b2</td>\n",
       "      <td>3.0</td>\n",
       "      <td>8.9</td>\n",
       "      <td>25.0</td>\n",
       "      <td>f3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>a3</td>\n",
       "      <td>b3</td>\n",
       "      <td>4.0</td>\n",
       "      <td>9.1</td>\n",
       "      <td>8.0</td>\n",
       "      <td>f4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>5</td>\n",
       "      <td>a4</td>\n",
       "      <td>None</td>\n",
       "      <td>5.0</td>\n",
       "      <td>12.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>f5</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    A     B    C     D     E   F\n",
       "0  a0    b0  1.0   0.1  10.0  f0\n",
       "1  a1    b1  2.0  10.2  19.0  f2\n",
       "3  a2    b2  3.0   8.9  25.0  f3\n",
       "4  a3    b3  4.0   9.1   8.0  f4\n",
       "5  a4  None  5.0  12.0   NaN  f5"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 删除重复值\n",
    "df1.drop_duplicates(['A'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>A</th>\n",
       "      <th>B</th>\n",
       "      <th>C</th>\n",
       "      <th>D</th>\n",
       "      <th>E</th>\n",
       "      <th>F</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>a0</td>\n",
       "      <td>b0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.1</td>\n",
       "      <td>10.0</td>\n",
       "      <td>f0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>a2</td>\n",
       "      <td>b2</td>\n",
       "      <td>3.0</td>\n",
       "      <td>8.9</td>\n",
       "      <td>25.0</td>\n",
       "      <td>f3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>a3</td>\n",
       "      <td>b3</td>\n",
       "      <td>4.0</td>\n",
       "      <td>9.1</td>\n",
       "      <td>8.0</td>\n",
       "      <td>f4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>5</td>\n",
       "      <td>a4</td>\n",
       "      <td>None</td>\n",
       "      <td>5.0</td>\n",
       "      <td>12.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>f5</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    A     B    C     D     E   F\n",
       "0  a0    b0  1.0   0.1  10.0  f0\n",
       "3  a2    b2  3.0   8.9  25.0  f3\n",
       "4  a3    b3  4.0   9.1   8.0  f4\n",
       "5  a4  None  5.0  12.0   NaN  f5"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df1.drop_duplicates(['A'],keep=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>A</th>\n",
       "      <th>B</th>\n",
       "      <th>C</th>\n",
       "      <th>D</th>\n",
       "      <th>E</th>\n",
       "      <th>F</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>a0</td>\n",
       "      <td>b0</td>\n",
       "      <td>1</td>\n",
       "      <td>0.1</td>\n",
       "      <td>10</td>\n",
       "      <td>f0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>a1</td>\n",
       "      <td>b1</td>\n",
       "      <td>2</td>\n",
       "      <td>10.2</td>\n",
       "      <td>19</td>\n",
       "      <td>f2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>a1</td>\n",
       "      <td>b2</td>\n",
       "      <td>b*</td>\n",
       "      <td>11.4</td>\n",
       "      <td>32</td>\n",
       "      <td>g2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>a2</td>\n",
       "      <td>b2</td>\n",
       "      <td>3</td>\n",
       "      <td>8.9</td>\n",
       "      <td>25</td>\n",
       "      <td>f3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>a3</td>\n",
       "      <td>b3</td>\n",
       "      <td>4</td>\n",
       "      <td>9.1</td>\n",
       "      <td>8</td>\n",
       "      <td>f4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>5</td>\n",
       "      <td>a4</td>\n",
       "      <td>b*</td>\n",
       "      <td>5</td>\n",
       "      <td>12.0</td>\n",
       "      <td>b*</td>\n",
       "      <td>f5</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    A   B   C     D   E   F\n",
       "0  a0  b0   1   0.1  10  f0\n",
       "1  a1  b1   2  10.2  19  f2\n",
       "2  a1  b2  b*  11.4  32  g2\n",
       "3  a2  b2   3   8.9  25  f3\n",
       "4  a3  b3   4   9.1   8  f4\n",
       "5  a4  b*   5  12.0  b*  f5"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 填充异常值B\n",
    "df1.fillna('b*')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>A</th>\n",
       "      <th>B</th>\n",
       "      <th>C</th>\n",
       "      <th>D</th>\n",
       "      <th>E</th>\n",
       "      <th>F</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>a0</td>\n",
       "      <td>b0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.1</td>\n",
       "      <td>10.0</td>\n",
       "      <td>f0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>a1</td>\n",
       "      <td>b1</td>\n",
       "      <td>2.0</td>\n",
       "      <td>10.2</td>\n",
       "      <td>19.0</td>\n",
       "      <td>f2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>a1</td>\n",
       "      <td>b2</td>\n",
       "      <td>18.8</td>\n",
       "      <td>11.4</td>\n",
       "      <td>32.0</td>\n",
       "      <td>g2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>a2</td>\n",
       "      <td>b2</td>\n",
       "      <td>3.0</td>\n",
       "      <td>8.9</td>\n",
       "      <td>25.0</td>\n",
       "      <td>f3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>a3</td>\n",
       "      <td>b3</td>\n",
       "      <td>4.0</td>\n",
       "      <td>9.1</td>\n",
       "      <td>8.0</td>\n",
       "      <td>f4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>5</td>\n",
       "      <td>a4</td>\n",
       "      <td>18.8</td>\n",
       "      <td>5.0</td>\n",
       "      <td>12.0</td>\n",
       "      <td>18.8</td>\n",
       "      <td>f5</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    A     B     C     D     E   F\n",
       "0  a0    b0   1.0   0.1  10.0  f0\n",
       "1  a1    b1   2.0  10.2  19.0  f2\n",
       "2  a1    b2  18.8  11.4  32.0  g2\n",
       "3  a2    b2   3.0   8.9  25.0  f3\n",
       "4  a3    b3   4.0   9.1   8.0  f4\n",
       "5  a4  18.8   5.0  12.0  18.8  f5"
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 填充异常值E\n",
    "df1.fillna(df1['E'].mean())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    10.0\n",
       "1    19.0\n",
       "2    32.0\n",
       "3    25.0\n",
       "4     8.0\n",
       "5     8.0\n",
       "Name: E, dtype: float64"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 插值法填充,在末尾取上面一个数，在开头取下面的数，在中间，取两个数的均值\n",
    "df1['E'].interpolate()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0     1.0\n",
       "1     2.5\n",
       "2     4.0\n",
       "3     5.0\n",
       "4    20.0\n",
       "dtype: float64"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pd.Series([1,None,4,5,20]).interpolate()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    10.000000\n",
       "1    19.000000\n",
       "2    32.000000\n",
       "3    25.000000\n",
       "4     8.000000\n",
       "5   -20.143603\n",
       "Name: E, dtype: float64"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 3次样条插值法\n",
    "df1['E'].interpolate(method='spline',order=3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "f:\\condaenv\\py36\\lib\\site-packages\\ipykernel_launcher.py:6: UserWarning: Boolean Series key will be reindexed to match DataFrame index.\n",
      "  \n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>A</th>\n",
       "      <th>B</th>\n",
       "      <th>C</th>\n",
       "      <th>D</th>\n",
       "      <th>E</th>\n",
       "      <th>F</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>a1</td>\n",
       "      <td>b1</td>\n",
       "      <td>2.0</td>\n",
       "      <td>10.2</td>\n",
       "      <td>19.0</td>\n",
       "      <td>f2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>a1</td>\n",
       "      <td>b2</td>\n",
       "      <td>NaN</td>\n",
       "      <td>11.4</td>\n",
       "      <td>32.0</td>\n",
       "      <td>g2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>a2</td>\n",
       "      <td>b2</td>\n",
       "      <td>3.0</td>\n",
       "      <td>8.9</td>\n",
       "      <td>25.0</td>\n",
       "      <td>f3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>a3</td>\n",
       "      <td>b3</td>\n",
       "      <td>4.0</td>\n",
       "      <td>9.1</td>\n",
       "      <td>8.0</td>\n",
       "      <td>f4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>5</td>\n",
       "      <td>a4</td>\n",
       "      <td>None</td>\n",
       "      <td>5.0</td>\n",
       "      <td>12.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>f5</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    A     B    C     D     E   F\n",
       "1  a1    b1  2.0  10.2  19.0  f2\n",
       "2  a1    b2  NaN  11.4  32.0  g2\n",
       "3  a2    b2  3.0   8.9  25.0  f3\n",
       "4  a3    b3  4.0   9.1   8.0  f4\n",
       "5  a4  None  5.0  12.0   NaN  f5"
      ]
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 使用四分数的上下界过滤D属性\n",
    "upper_q = df1['D'].quantile(0.75)\n",
    "lower_q = df1['D'].quantile(0.25)\n",
    "q_int = upper_q - lower_q\n",
    "k = 1.5\n",
    "df1[df1['D']>lower_q-k*q_int][df1['D']<upper_q+k*q_int]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>A</th>\n",
       "      <th>B</th>\n",
       "      <th>C</th>\n",
       "      <th>D</th>\n",
       "      <th>E</th>\n",
       "      <th>F</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>a0</td>\n",
       "      <td>b0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.1</td>\n",
       "      <td>10.0</td>\n",
       "      <td>f0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>a1</td>\n",
       "      <td>b1</td>\n",
       "      <td>2.0</td>\n",
       "      <td>10.2</td>\n",
       "      <td>19.0</td>\n",
       "      <td>f2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>a2</td>\n",
       "      <td>b2</td>\n",
       "      <td>3.0</td>\n",
       "      <td>8.9</td>\n",
       "      <td>25.0</td>\n",
       "      <td>f3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>a3</td>\n",
       "      <td>b3</td>\n",
       "      <td>4.0</td>\n",
       "      <td>9.1</td>\n",
       "      <td>8.0</td>\n",
       "      <td>f4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>5</td>\n",
       "      <td>a4</td>\n",
       "      <td>None</td>\n",
       "      <td>5.0</td>\n",
       "      <td>12.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>f5</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    A     B    C     D     E   F\n",
       "0  a0    b0  1.0   0.1  10.0  f0\n",
       "1  a1    b1  2.0  10.2  19.0  f2\n",
       "3  a2    b2  3.0   8.9  25.0  f3\n",
       "4  a3    b3  4.0   9.1   8.0  f4\n",
       "5  a4  None  5.0  12.0   NaN  f5"
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 过滤F属性中的异常值\n",
    "df1[[True if item.startswith('f') else False for item in list(df1['F'].values)]]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 特征预处理"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 特征选择\n",
    "特征选择使用的是样本作为估计量，而正是建模是使用全量数据"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>A</th>\n",
       "      <th>B</th>\n",
       "      <th>C</th>\n",
       "      <th>D</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>0.888145</td>\n",
       "      <td>0.543618</td>\n",
       "      <td>-0.960614</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>0.462175</td>\n",
       "      <td>0.034925</td>\n",
       "      <td>-0.234244</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>-0.064108</td>\n",
       "      <td>1.743161</td>\n",
       "      <td>-0.792092</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>0.943182</td>\n",
       "      <td>-0.074923</td>\n",
       "      <td>-0.879533</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>-0.268589</td>\n",
       "      <td>0.254662</td>\n",
       "      <td>0.688653</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>5</td>\n",
       "      <td>1.075152</td>\n",
       "      <td>-1.233661</td>\n",
       "      <td>-0.036948</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>6</td>\n",
       "      <td>0.594596</td>\n",
       "      <td>-1.114602</td>\n",
       "      <td>0.003042</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>7</td>\n",
       "      <td>0.677053</td>\n",
       "      <td>0.240538</td>\n",
       "      <td>1.740412</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>8</td>\n",
       "      <td>0.945887</td>\n",
       "      <td>0.367086</td>\n",
       "      <td>0.165444</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>9</td>\n",
       "      <td>-0.395973</td>\n",
       "      <td>-1.135481</td>\n",
       "      <td>0.409995</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "          A         B         C  D\n",
       "0  0.888145  0.543618 -0.960614  1\n",
       "1  0.462175  0.034925 -0.234244  0\n",
       "2 -0.064108  1.743161 -0.792092  0\n",
       "3  0.943182 -0.074923 -0.879533  0\n",
       "4 -0.268589  0.254662  0.688653  1\n",
       "5  1.075152 -1.233661 -0.036948  0\n",
       "6  0.594596 -1.114602  0.003042  1\n",
       "7  0.677053  0.240538  1.740412  1\n",
       "8  0.945887  0.367086  0.165444  0\n",
       "9 -0.395973 -1.135481  0.409995  0"
      ]
     },
     "execution_count": 35,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df2 = pd.DataFrame({'A':ss.norm.rvs(size=10),'B':ss.norm.rvs(size=10),\n",
    "                    'C':ss.norm.rvs(size=10),'D':np.random.randint(low=0,high=2,size=10)})\n",
    "df2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    1\n",
       "1    0\n",
       "2    0\n",
       "3    0\n",
       "4    1\n",
       "5    0\n",
       "6    1\n",
       "7    1\n",
       "8    0\n",
       "9    0\n",
       "Name: D, dtype: int32"
      ]
     },
     "execution_count": 48,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 引入SVR回归器和决策树回归器\n",
    "from sklearn.svm import SVR\n",
    "from sklearn.tree import DecisionTreeRegressor\n",
    "# 引入特征选择的三种思想的包\n",
    "from sklearn.feature_selection import SelectKBest,RFE,SelectFromModel\n",
    "# 特征和标注\n",
    "X = df2.loc[:,['A','B','C']]\n",
    "Y = df2.loc[:,'D']\n",
    "Y"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[ 0.88814511, -0.96061358],\n",
       "       [ 0.46217474, -0.23424413],\n",
       "       [-0.06410792, -0.79209155],\n",
       "       [ 0.94318202, -0.87953263],\n",
       "       [-0.26858863,  0.68865281],\n",
       "       [ 1.07515191, -0.03694783],\n",
       "       [ 0.59459605,  0.00304236],\n",
       "       [ 0.67705281,  1.74041243],\n",
       "       [ 0.94588657,  0.16544364],\n",
       "       [-0.39597342,  0.4099947 ]])"
      ]
     },
     "execution_count": 50,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 1.过滤思想\n",
    "skb = SelectKBest(k=2)\n",
    "skb.fit(X,Y)\n",
    "skb.transform(X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[ 0.54361779, -0.96061358],\n",
       "       [ 0.03492471, -0.23424413],\n",
       "       [ 1.74316121, -0.79209155],\n",
       "       [-0.07492272, -0.87953263],\n",
       "       [ 0.25466202,  0.68865281],\n",
       "       [-1.23366086, -0.03694783],\n",
       "       [-1.11460221,  0.00304236],\n",
       "       [ 0.24053831,  1.74041243],\n",
       "       [ 0.36708556,  0.16544364],\n",
       "       [-1.1354814 ,  0.4099947 ]])"
      ]
     },
     "execution_count": 51,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 2.包裹思想\n",
    "rfe = RFE(estimator=SVR(kernel='linear'),n_features_to_select=2,step=1)\n",
    "rfe.fit_transform(X,Y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[ 0.54361779, -0.96061358],\n",
       "       [ 0.03492471, -0.23424413],\n",
       "       [ 1.74316121, -0.79209155],\n",
       "       [-0.07492272, -0.87953263],\n",
       "       [ 0.25466202,  0.68865281],\n",
       "       [-1.23366086, -0.03694783],\n",
       "       [-1.11460221,  0.00304236],\n",
       "       [ 0.24053831,  1.74041243],\n",
       "       [ 0.36708556,  0.16544364],\n",
       "       [-1.1354814 ,  0.4099947 ]])"
      ]
     },
     "execution_count": 52,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 3.嵌入思想\n",
    "# threshold=0.1 为阈值\n",
    "sfm = SelectFromModel(estimator=DecisionTreeRegressor(),threshold=0.1)\n",
    "sfm.fit_transform(X,Y)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 特征变换-离散化"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[6, 8, 10, 15, 16, 24, 15, 40, 67]"
      ]
     },
     "execution_count": 54,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "lst = [6,8,10,15,16,24,15,40,67]\n",
    "lst"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[(5.999, 13.333], (5.999, 13.333], (5.999, 13.333], (13.333, 18.667], (13.333, 18.667], (18.667, 67.0], (13.333, 18.667], (18.667, 67.0], (18.667, 67.0]]\n",
       "Categories (3, interval[float64]): [(5.999, 13.333] < (13.333, 18.667] < (18.667, 67.0]]"
      ]
     },
     "execution_count": 55,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 等身（等频）分箱\n",
    "pd.qcut(lst,q=3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[low, low, low, medium, medium, high, medium, high, high]\n",
       "Categories (3, object): [low < medium < high]"
      ]
     },
     "execution_count": 57,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pd.qcut(lst,q=3,labels=['low','medium','high'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[(5.939, 26.333], (5.939, 26.333], (5.939, 26.333], (5.939, 26.333], (5.939, 26.333], (5.939, 26.333], (5.939, 26.333], (26.333, 46.667], (46.667, 67.0]]\n",
       "Categories (3, interval[float64]): [(5.939, 26.333] < (26.333, 46.667] < (46.667, 67.0]]"
      ]
     },
     "execution_count": 58,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 等宽分箱\n",
    "pd.cut(lst,bins=3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[low, low, low, low, low, low, low, medium, high]\n",
       "Categories (3, object): [low < medium < high]"
      ]
     },
     "execution_count": 59,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pd.cut(lst,bins=3,labels=['low','medium','high'])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 特征变换-归一化和标准化"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 引入标准化和归一化的函数\n",
    "from sklearn.preprocessing import MinMaxScaler,StandardScaler"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[0.  ],\n",
       "       [0.15],\n",
       "       [0.45],\n",
       "       [0.7 ],\n",
       "       [1.  ]])"
      ]
     },
     "execution_count": 61,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 归一化，reshape(-1,1)的意思：-1是不要求有几行，但一定要有1列\n",
    "MinMaxScaler().fit_transform(np.array([1,4,10,15,21]).reshape(-1,1))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[ 1.],\n",
       "       [ 1.],\n",
       "       [ 1.],\n",
       "       [ 1.],\n",
       "       [-1.],\n",
       "       [-1.],\n",
       "       [-1.],\n",
       "       [-1.]])"
      ]
     },
     "execution_count": 62,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 标准化\n",
    "StandardScaler().fit_transform(np.array([1,1,1,1,0,0,0,0]).reshape(-1,1))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[ 2.64575131],\n",
       "       [-0.37796447],\n",
       "       [-0.37796447],\n",
       "       [-0.37796447],\n",
       "       [-0.37796447],\n",
       "       [-0.37796447],\n",
       "       [-0.37796447],\n",
       "       [-0.37796447]])"
      ]
     },
     "execution_count": 63,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 标准化\n",
    "StandardScaler().fit_transform(np.array([1,0,0,0,0,0,0,0]).reshape(-1,1))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 特征变换-数值化"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 引入标签化和独热编码\n",
    "from sklearn.preprocessing import LabelEncoder,OneHotEncoder"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 66,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([0, 1, 0, 1], dtype=int64)"
      ]
     },
     "execution_count": 66,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 标签化\n",
    "LabelEncoder().fit_transform(np.array(['Down','Up','Down','Up']).reshape(-1,1))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 67,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([1, 2, 0, 2, 1], dtype=int64)"
      ]
     },
     "execution_count": 67,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 首字母做升序，进行排序并编码\n",
    "LabelEncoder().fit_transform(np.array(['Low','Medium','High','Medium','Low']).reshape(-1,1))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 69,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([2, 3, 0, 1], dtype=int64)"
      ]
     },
     "execution_count": 69,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 独热编码\n",
    "# 独热编码前需要先进行标签化\n",
    "lb_encoder = LabelEncoder()\n",
    "lb_tran_f = lb_encoder.fit_transform(np.array(['Red','Yellow','Blue','Green']))\n",
    "lb_tran_f"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 75,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "f:\\condaenv\\py36\\lib\\site-packages\\sklearn\\preprocessing\\_encoders.py:415: FutureWarning: The handling of integer data will change in version 0.22. Currently, the categories are determined based on the range [0, max(values)], while in the future they will be determined based on the unique values.\n",
      "If you want the future behaviour and silence this warning, you can specify \"categories='auto'\".\n",
      "In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.\n",
      "  warnings.warn(msg, FutureWarning)\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "array([[0., 0., 0., 1.],\n",
       "       [1., 0., 0., 0.],\n",
       "       [0., 1., 0., 0.],\n",
       "       [0., 1., 0., 0.],\n",
       "       [0., 0., 1., 0.]])"
      ]
     },
     "execution_count": 75,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 独热编码拟合\n",
    "oht_encoder = OneHotEncoder().fit(lb_tran_f.reshape(-1,1))\n",
    "# transform\n",
    "oht_encoder.transform(lb_encoder.transform(np.array(['Yellow','Blue','Green','Green','Red']))\n",
    "                      .reshape(-1,1)).toarray()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 特征变换-正规化"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 76,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 导入正规化包\n",
    "from sklearn.preprocessing import Normalizer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 78,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[ 0.125,  0.125,  0.375, -0.125,  0.25 ]])"
      ]
     },
     "execution_count": 78,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# L1正则化\n",
    "Normalizer(norm='l1').fit_transform(np.array([1,1,3,-1,2]).reshape(1,-1))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 80,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[ 0.25,  0.25,  0.75, -0.25,  0.5 ]])"
      ]
     },
     "execution_count": 80,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# L2正则化\n",
    "Normalizer(norm='l2').fit_transform(np.array([[1,1,3,-1,2]]))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 特征降维-LDA"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 81,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 导入LDA\n",
    "from sklearn.discriminant_analysis import LinearDiscriminantAnalysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 83,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[-1.73205081],\n",
       "       [-1.73205081],\n",
       "       [-3.46410162],\n",
       "       [ 1.73205081],\n",
       "       [ 1.73205081],\n",
       "       [ 3.46410162]])"
      ]
     },
     "execution_count": 83,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# LDA降维，有监督降维，使同一标注间尽可能小，不同标注间尽可能大\n",
    "X = np.array([[-1,-1],[-2,-1],[-3,-2],[1,1],[2,1],[3,2]])\n",
    "Y = np.array([1,1,1,2,2,2])\n",
    "LinearDiscriminantAnalysis(n_components=1).fit_transform(X,Y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 84,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([2])"
      ]
     },
     "execution_count": 84,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# LDA可以当做一个判别器（分类器）来用，也叫作fisher classification\n",
    "clf = LinearDiscriminantAnalysis(n_components=1).fit(X,Y)\n",
    "clf.predict([[0.8,1]])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
